In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
train_beneficiary = pd.read_csv("/content/Train_Beneficiarydata-1542865627584.csv")
train_inpatient = pd.read_csv("/content/Train_Inpatientdata-1542865627584.csv")
train_outpatient = pd.read_csv("/content/Train_Outpatientdata-1542865627584.csv")
train_provider = pd.read_csv("/content/Train-1542865627584.csv") # Assuming this is the provider labels file
unseen_beneficiary = pd.read_csv("/content/Unseen_Beneficiarydata-1542969243754.csv")
unseen_inpatient = pd.read_csv("/content/Unseen_Inpatientdata-1542969243754.csv")
unseen_outpatient = pd.read_csv("/content/Unseen_Outpatientdata-1542969243754.csv")
unseen_providers = pd.read_csv("/content/Unseen-1542969243754.csv")
print("Training Beneficiary Data Shape:", train_beneficiary.shape)
print("Training Inpatient Data Shape:", train_inpatient.shape)
print("Training Outpatient Data Shape:", train_outpatient.shape)
print("Training Provider Data Shape:", train_provider.shape)
print("Unseen Beneficiary Data Shape:", unseen_beneficiary.shape)
print("Unseen Inpatient Data Shape:", unseen_inpatient.shape)
print("Unseen Outpatient Data Shape:", unseen_outpatient.shape)
print("Unseen Providers Data Shape:", unseen_providers.shape)
Training Beneficiary Data Shape: (138556, 25) Training Inpatient Data Shape: (40474, 30) Training Outpatient Data Shape: (517737, 27) Training Provider Data Shape: (5410, 2) Unseen Beneficiary Data Shape: (63968, 25) Unseen Inpatient Data Shape: (9551, 30) Unseen Outpatient Data Shape: (125841, 27) Unseen Providers Data Shape: (1353, 1)
In [20]:
train_claims = pd.concat([train_inpatient, train_outpatient], ignore_index=True)
train_merged = pd.merge(train_claims, train_beneficiary, on='BeneID', how='left')
train_df = pd.merge(train_merged, train_provider, on='Provider', how='left')
print("Merged Training Data Shape:", train_df.shape)
print(train_df.head())
print("\nValue counts of 'PotentialFraud':")
print(train_df['PotentialFraud'].value_counts())
Merged Training Data Shape: (558211, 55)
BeneID ClaimID ClaimStartDt ClaimEndDt Provider \
0 BENE11001 CLM46614 2009-04-12 2009-04-18 PRV55912
1 BENE11001 CLM66048 2009-08-31 2009-09-02 PRV55907
2 BENE11001 CLM68358 2009-09-17 2009-09-20 PRV56046
3 BENE11011 CLM38412 2009-02-14 2009-02-22 PRV52405
4 BENE11014 CLM63689 2009-08-13 2009-08-30 PRV56614
InscClaimAmtReimbursed AttendingPhysician OperatingPhysician \
0 26000 PHY390922 NaN
1 5000 PHY318495 PHY318495
2 5000 PHY372395 NaN
3 5000 PHY369659 PHY392961
4 10000 PHY379376 PHY398258
OtherPhysician AdmissionDt ... ChronicCond_Diabetes \
0 NaN 2009-04-12 ... 1
1 NaN 2009-08-31 ... 1
2 PHY324689 2009-09-17 ... 1
3 PHY349768 2009-02-14 ... 1
4 NaN 2009-08-13 ... 2
ChronicCond_IschemicHeart ChronicCond_Osteoporasis \
0 1 2
1 1 2
2 1 2
3 2 2
4 1 2
ChronicCond_rheumatoidarthritis ChronicCond_stroke IPAnnualReimbursementAmt \
0 1 1 36000
1 1 1 36000
2 1 1 36000
3 1 1 5000
4 2 2 21260
IPAnnualDeductibleAmt OPAnnualReimbursementAmt OPAnnualDeductibleAmt \
0 3204 60 70
1 3204 60 70
2 3204 60 70
3 1068 250 320
4 2136 120 100
PotentialFraud
0 Yes
1 No
2 No
3 No
4 No
[5 rows x 55 columns]
Value counts of 'PotentialFraud':
PotentialFraud
No 345415
Yes 212796
Name: count, dtype: int64
In [21]:
plt.figure(figsize=(6, 4))
sns.countplot(data=train_df, x='PotentialFraud')
plt.title('Distribution of Potential Fraud Cases')
plt.xlabel('PotentialFraud')
plt.ylabel('Number of Providers')
plt.show()
print("\nProportion of Potential Fraud Cases:")
print(train_df['PotentialFraud'].value_counts(normalize=True))
Proportion of Potential Fraud Cases: PotentialFraud No 0.618789 Yes 0.381211 Name: proportion, dtype: float64
In [22]:
numerical_cols = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
# Remove identifier columns and potentially some date-related or aggregated amounts
numerical_cols = [col for col in numerical_cols if col not in ['BeneID', 'ClaimID', 'Provider', 'ClaimStartDt', 'ClaimEndDt',
'AdmissionDt', 'DischargeDt', 'DOB', 'DOD']]
for col in numerical_cols:
if col in train_df.columns:
# 1. Check data type and convert if needed.. Basically we'll perform EDA
if train_df[col].dtype not in ['int64', 'float64']:
try:
train_df[col] = pd.to_numeric(train_df[col], errors='coerce') # Convert to numeric, NaNs for failures
print(f"Converted column '{col}' to numeric.")
except ValueError:
print(f"Column '{col}' cannot be converted to numeric. Skipping boxplot.")
continue # Skip to the next column
unique_value_count = train_df[col].nunique()
non_nan_count = train_df[col].count()
if unique_value_count < 2 or non_nan_count == 0:
print(f"Column '{col}' has too few unique values or all NaNs. Skipping boxplot.")
continue
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
sns.histplot(train_df[col].dropna(), kde=True)
plt.title(f'Distribution of {col}')
plt.subplot(1, 2, 2)
sns.boxplot(x='PotentialFraud', y=col, data=train_df)
plt.title(f'{col} by Potential Fraud')
plt.tight_layout()
plt.show()
print(f"\nDescriptive Statistics for {col} by Potential Fraud:")
print(train_df.groupby('PotentialFraud')[col].describe())
else:
print(f"Column '{col}' not found in train_df.")
Descriptive Statistics for InscClaimAmtReimbursed by Potential Fraud:
count mean std min 25% 50% 75% \
PotentialFraud
No 345415.0 755.213352 3056.460166 0.0 40.0 80.0 300.0
Yes 212796.0 1389.505066 4785.074685 0.0 40.0 90.0 400.0
max
PotentialFraud
No 125000.0
Yes 125000.0
Descriptive Statistics for DeductibleAmtPaid by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345024.0 54.282430 228.997887 0.0 0.0 0.0 0.0 1068.0
Yes 212288.0 117.652769 330.762589 0.0 0.0 0.0 0.0 1068.0
Descriptive Statistics for ClmProcedureCode_1 by Potential Fraud:
count mean std min 25% 50% \
PotentialFraud
No 9786.0 5866.282240 3060.127143 13.0 3818.0 5188.0
Yes 13524.0 5917.770334 3043.427785 11.0 3891.0 5451.0
75% max
PotentialFraud
No 8674.0 9999.0
Yes 8659.0 9999.0
Descriptive Statistics for ClmProcedureCode_2 by Potential Fraud:
count mean std min 25% 50% \
PotentialFraud
No 2244.0 4132.491979 2019.352638 42.0 2752.0 4019.0
Yes 3246.0 4088.291436 2040.207424 42.0 2724.0 4019.0
75% max
PotentialFraud
No 4439.0 9999.0
Yes 4439.0 9999.0
Descriptive Statistics for ClmProcedureCode_3 by Potential Fraud:
count mean std min 25% 50% \
PotentialFraud
No 406.0 4176.603448 2298.686770 185.0 2724.0 4019.0
Yes 563.0 4253.229130 2271.136731 42.0 2724.0 4019.0
75% max
PotentialFraud
No 4547.0 9982.0
Yes 5234.0 9999.0
Descriptive Statistics for ClmProcedureCode_4 by Potential Fraud:
count mean std min 25% 50% \
PotentialFraud
No 42.0 4165.285714 2382.664877 42.0 2730.25 4019.0
Yes 76.0 4017.750000 1834.577155 185.0 2761.75 4019.0
75% max
PotentialFraud
No 4998.5 9986.0
Yes 4439.0 9974.0
Descriptive Statistics for ClmProcedureCode_5 by Potential Fraud:
count mean std min 25% 50% \
PotentialFraud
No 3.0 3761.666667 909.717722 2724.0 3431.5 4139.0
Yes 6.0 6023.333333 3160.536073 2724.0 4139.0 4662.0
75% max
PotentialFraud
No 4280.5 4422.0
Yes 8774.5 9982.0
Column 'ClmProcedureCode_6' has too few unique values or all NaNs. Skipping boxplot.
Descriptive Statistics for Gender by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.579017 0.493718 1.0 1.0 2.0 2.0 2.0
Yes 212796.0 1.578549 0.493793 1.0 1.0 2.0 2.0 2.0
Descriptive Statistics for Race by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.241223 0.689281 1.0 1.0 1.0 1.0 5.0
Yes 212796.0 1.277392 0.760395 1.0 1.0 1.0 1.0 5.0
Descriptive Statistics for State by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 25.946276 15.085346 1.0 13.0 25.0 38.0 54.0
Yes 212796.0 24.636483 15.331032 1.0 10.0 23.0 37.0 54.0
Descriptive Statistics for County by Potential Fraud:
count mean std min 25% 50% 75% \
PotentialFraud
No 345415.0 376.183579 271.528928 0.0 140.0 340.0 570.0
Yes 212796.0 382.491419 254.586720 0.0 160.0 360.0 570.0
max
PotentialFraud
No 999.0
Yes 999.0
Descriptive Statistics for NoOfMonths_PartACov by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 11.927965 0.911568 0.0 12.0 12.0 12.0 12.0
Yes 212796.0 11.937165 0.853015 0.0 12.0 12.0 12.0 12.0
Descriptive Statistics for NoOfMonths_PartBCov by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 11.937562 0.795000 0.0 12.0 12.0 12.0 12.0
Yes 212796.0 11.940732 0.770898 0.0 12.0 12.0 12.0 12.0
Descriptive Statistics for ChronicCond_Alzheimer by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.600538 0.489788 1.0 1.0 2.0 2.0 2.0
Yes 212796.0 1.594226 0.491042 1.0 1.0 2.0 2.0 2.0
Descriptive Statistics for ChronicCond_Heartfailure by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.413063 0.492385 1.0 1.0 1.0 2.0 2.0
Yes 212796.0 1.403908 0.490681 1.0 1.0 1.0 2.0 2.0
Descriptive Statistics for ChronicCond_KidneyDisease by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.594294 0.491029 1.0 1.0 2.0 2.0 2.0
Yes 212796.0 1.577779 0.493915 1.0 1.0 2.0 2.0 2.0
Descriptive Statistics for ChronicCond_Cancer by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.849668 0.357397 1.0 2.0 2.0 2.0 2.0
Yes 212796.0 1.846905 0.360080 1.0 2.0 2.0 2.0 2.0
Descriptive Statistics for ChronicCond_ObstrPulmonary by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.691464 0.461890 1.0 1.0 2.0 2.0 2.0
Yes 212796.0 1.679938 0.466501 1.0 1.0 2.0 2.0 2.0
Descriptive Statistics for ChronicCond_Depression by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.565375 0.495708 1.0 1.0 2.0 2.0 2.0
Yes 212796.0 1.564898 0.495772 1.0 1.0 2.0 2.0 2.0
Descriptive Statistics for ChronicCond_Diabetes by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.296516 0.456722 1.0 1.0 1.0 2.0 2.0
Yes 212796.0 1.291505 0.454456 1.0 1.0 1.0 2.0 2.0
Descriptive Statistics for ChronicCond_IschemicHeart by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.243467 0.429175 1.0 1.0 1.0 1.0 2.0
Yes 212796.0 1.236301 0.424810 1.0 1.0 1.0 1.0 2.0
Descriptive Statistics for ChronicCond_Osteoporasis by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.682784 0.465393 1.0 1.0 2.0 2.0 2.0
Yes 212796.0 1.681653 0.465836 1.0 1.0 2.0 2.0 2.0
Descriptive Statistics for ChronicCond_rheumatoidarthritis by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.690645 0.462229 1.0 1.0 2.0 2.0 2.0
Yes 212796.0 1.685882 0.464165 1.0 1.0 2.0 2.0 2.0
Descriptive Statistics for ChronicCond_stroke by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.900453 0.299395 1.0 2.0 2.0 2.0 2.0
Yes 212796.0 1.894754 0.306871 1.0 2.0 2.0 2.0 2.0
Descriptive Statistics for IPAnnualReimbursementAmt by Potential Fraud:
count mean std min 25% 50% 75% \
PotentialFraud
No 345415.0 4903.940535 11388.465739 -8000.0 0.0 0.0 5000.0
Yes 212796.0 5753.945375 12386.869727 -500.0 0.0 0.0 7000.0
max
PotentialFraud
No 155600.0
Yes 161470.0
Descriptive Statistics for IPAnnualDeductibleAmt by Potential Fraud:
count mean std min 25% 50% 75% \
PotentialFraud
No 345415.0 534.962431 1137.325961 0.0 0.0 0.0 1068.0
Yes 212796.0 623.612559 1242.152668 0.0 0.0 0.0 1068.0
max
PotentialFraud
No 38272.0
Yes 38272.0
Descriptive Statistics for OPAnnualReimbursementAmt by Potential Fraud:
count mean std min 25% 50% \
PotentialFraud
No 345415.0 2271.897486 3834.023483 -50.0 470.0 1170.0
Yes 212796.0 2288.496870 3958.231143 -70.0 460.0 1160.0
75% max
PotentialFraud
No 2600.0 102960.0
Yes 2570.0 102960.0
Descriptive Statistics for OPAnnualDeductibleAmt by Potential Fraud:
count mean std min 25% 50% 75% \
PotentialFraud
No 345415.0 647.402655 990.644469 0.0 120.0 340.0 790.0
Yes 212796.0 653.425807 1020.208428 0.0 120.0 340.0 800.0
max
PotentialFraud
No 13840.0
Yes 13840.0
In [23]:
numerical_cols = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_cols = [col for col in numerical_cols if col not in ['BeneID', 'ClaimID', 'Provider', 'ClaimStartDt', 'ClaimEndDt',
'AdmissionDt', 'DischargeDt', 'DOB', 'DOD']]
for col in numerical_cols:
if col in train_df.columns:
if train_df[col].dtype not in ['int64', 'float64']:
try:
train_df[col] = pd.to_numeric(train_df[col], errors='coerce')
print(f"Converted column '{col}' to numeric.")
except ValueError:
print(f"Column '{col}' cannot be converted to numeric. Skipping boxplot.")
continue
#Check for too few unique values or all NaNs
unique_value_count = train_df[col].nunique()
non_nan_count = train_df[col].count()
if unique_value_count < 2 or non_nan_count == 0:
print(f"Column '{col}' has too few unique values or all NaNs. Skipping boxplot.")
continue
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
sns.histplot(train_df[col].dropna(), kde=True)
plt.title(f'Distribution of {col}')
plt.subplot(1, 2, 2)
sns.boxplot(x='PotentialFraud', y=col, data=train_df)
plt.title(f'{col} by Potential Fraud')
plt.tight_layout()
plt.show()
print(f"\nDescriptive Statistics for {col} by Potential Fraud:")
print(train_df.groupby('PotentialFraud')[col].describe())
else:
print(f"Column '{col}' not found in train_df.")
Descriptive Statistics for InscClaimAmtReimbursed by Potential Fraud:
count mean std min 25% 50% 75% \
PotentialFraud
No 345415.0 755.213352 3056.460166 0.0 40.0 80.0 300.0
Yes 212796.0 1389.505066 4785.074685 0.0 40.0 90.0 400.0
max
PotentialFraud
No 125000.0
Yes 125000.0
Descriptive Statistics for DeductibleAmtPaid by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345024.0 54.282430 228.997887 0.0 0.0 0.0 0.0 1068.0
Yes 212288.0 117.652769 330.762589 0.0 0.0 0.0 0.0 1068.0
Descriptive Statistics for ClmProcedureCode_1 by Potential Fraud:
count mean std min 25% 50% \
PotentialFraud
No 9786.0 5866.282240 3060.127143 13.0 3818.0 5188.0
Yes 13524.0 5917.770334 3043.427785 11.0 3891.0 5451.0
75% max
PotentialFraud
No 8674.0 9999.0
Yes 8659.0 9999.0
Descriptive Statistics for ClmProcedureCode_2 by Potential Fraud:
count mean std min 25% 50% \
PotentialFraud
No 2244.0 4132.491979 2019.352638 42.0 2752.0 4019.0
Yes 3246.0 4088.291436 2040.207424 42.0 2724.0 4019.0
75% max
PotentialFraud
No 4439.0 9999.0
Yes 4439.0 9999.0
Descriptive Statistics for ClmProcedureCode_3 by Potential Fraud:
count mean std min 25% 50% \
PotentialFraud
No 406.0 4176.603448 2298.686770 185.0 2724.0 4019.0
Yes 563.0 4253.229130 2271.136731 42.0 2724.0 4019.0
75% max
PotentialFraud
No 4547.0 9982.0
Yes 5234.0 9999.0
Descriptive Statistics for ClmProcedureCode_4 by Potential Fraud:
count mean std min 25% 50% \
PotentialFraud
No 42.0 4165.285714 2382.664877 42.0 2730.25 4019.0
Yes 76.0 4017.750000 1834.577155 185.0 2761.75 4019.0
75% max
PotentialFraud
No 4998.5 9986.0
Yes 4439.0 9974.0
Descriptive Statistics for ClmProcedureCode_5 by Potential Fraud:
count mean std min 25% 50% \
PotentialFraud
No 3.0 3761.666667 909.717722 2724.0 3431.5 4139.0
Yes 6.0 6023.333333 3160.536073 2724.0 4139.0 4662.0
75% max
PotentialFraud
No 4280.5 4422.0
Yes 8774.5 9982.0
Column 'ClmProcedureCode_6' has too few unique values or all NaNs. Skipping boxplot.
Descriptive Statistics for Gender by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.579017 0.493718 1.0 1.0 2.0 2.0 2.0
Yes 212796.0 1.578549 0.493793 1.0 1.0 2.0 2.0 2.0
Descriptive Statistics for Race by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.241223 0.689281 1.0 1.0 1.0 1.0 5.0
Yes 212796.0 1.277392 0.760395 1.0 1.0 1.0 1.0 5.0
Descriptive Statistics for State by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 25.946276 15.085346 1.0 13.0 25.0 38.0 54.0
Yes 212796.0 24.636483 15.331032 1.0 10.0 23.0 37.0 54.0
Descriptive Statistics for County by Potential Fraud:
count mean std min 25% 50% 75% \
PotentialFraud
No 345415.0 376.183579 271.528928 0.0 140.0 340.0 570.0
Yes 212796.0 382.491419 254.586720 0.0 160.0 360.0 570.0
max
PotentialFraud
No 999.0
Yes 999.0
Descriptive Statistics for NoOfMonths_PartACov by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 11.927965 0.911568 0.0 12.0 12.0 12.0 12.0
Yes 212796.0 11.937165 0.853015 0.0 12.0 12.0 12.0 12.0
Descriptive Statistics for NoOfMonths_PartBCov by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 11.937562 0.795000 0.0 12.0 12.0 12.0 12.0
Yes 212796.0 11.940732 0.770898 0.0 12.0 12.0 12.0 12.0
Descriptive Statistics for ChronicCond_Alzheimer by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.600538 0.489788 1.0 1.0 2.0 2.0 2.0
Yes 212796.0 1.594226 0.491042 1.0 1.0 2.0 2.0 2.0
Descriptive Statistics for ChronicCond_Heartfailure by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.413063 0.492385 1.0 1.0 1.0 2.0 2.0
Yes 212796.0 1.403908 0.490681 1.0 1.0 1.0 2.0 2.0
Descriptive Statistics for ChronicCond_KidneyDisease by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.594294 0.491029 1.0 1.0 2.0 2.0 2.0
Yes 212796.0 1.577779 0.493915 1.0 1.0 2.0 2.0 2.0
Descriptive Statistics for ChronicCond_Cancer by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.849668 0.357397 1.0 2.0 2.0 2.0 2.0
Yes 212796.0 1.846905 0.360080 1.0 2.0 2.0 2.0 2.0
Descriptive Statistics for ChronicCond_ObstrPulmonary by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.691464 0.461890 1.0 1.0 2.0 2.0 2.0
Yes 212796.0 1.679938 0.466501 1.0 1.0 2.0 2.0 2.0
Descriptive Statistics for ChronicCond_Depression by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.565375 0.495708 1.0 1.0 2.0 2.0 2.0
Yes 212796.0 1.564898 0.495772 1.0 1.0 2.0 2.0 2.0
Descriptive Statistics for ChronicCond_Diabetes by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.296516 0.456722 1.0 1.0 1.0 2.0 2.0
Yes 212796.0 1.291505 0.454456 1.0 1.0 1.0 2.0 2.0
Descriptive Statistics for ChronicCond_IschemicHeart by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.243467 0.429175 1.0 1.0 1.0 1.0 2.0
Yes 212796.0 1.236301 0.424810 1.0 1.0 1.0 1.0 2.0
Descriptive Statistics for ChronicCond_Osteoporasis by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.682784 0.465393 1.0 1.0 2.0 2.0 2.0
Yes 212796.0 1.681653 0.465836 1.0 1.0 2.0 2.0 2.0
Descriptive Statistics for ChronicCond_rheumatoidarthritis by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.690645 0.462229 1.0 1.0 2.0 2.0 2.0
Yes 212796.0 1.685882 0.464165 1.0 1.0 2.0 2.0 2.0
Descriptive Statistics for ChronicCond_stroke by Potential Fraud:
count mean std min 25% 50% 75% max
PotentialFraud
No 345415.0 1.900453 0.299395 1.0 2.0 2.0 2.0 2.0
Yes 212796.0 1.894754 0.306871 1.0 2.0 2.0 2.0 2.0
Descriptive Statistics for IPAnnualReimbursementAmt by Potential Fraud:
count mean std min 25% 50% 75% \
PotentialFraud
No 345415.0 4903.940535 11388.465739 -8000.0 0.0 0.0 5000.0
Yes 212796.0 5753.945375 12386.869727 -500.0 0.0 0.0 7000.0
max
PotentialFraud
No 155600.0
Yes 161470.0
Descriptive Statistics for IPAnnualDeductibleAmt by Potential Fraud:
count mean std min 25% 50% 75% \
PotentialFraud
No 345415.0 534.962431 1137.325961 0.0 0.0 0.0 1068.0
Yes 212796.0 623.612559 1242.152668 0.0 0.0 0.0 1068.0
max
PotentialFraud
No 38272.0
Yes 38272.0
Descriptive Statistics for OPAnnualReimbursementAmt by Potential Fraud:
count mean std min 25% 50% \
PotentialFraud
No 345415.0 2271.897486 3834.023483 -50.0 470.0 1170.0
Yes 212796.0 2288.496870 3958.231143 -70.0 460.0 1160.0
75% max
PotentialFraud
No 2600.0 102960.0
Yes 2570.0 102960.0
Descriptive Statistics for OPAnnualDeductibleAmt by Potential Fraud:
count mean std min 25% 50% 75% \
PotentialFraud
No 345415.0 647.402655 990.644469 0.0 120.0 340.0 790.0
Yes 212796.0 653.425807 1020.208428 0.0 120.0 340.0 800.0
max
PotentialFraud
No 13840.0
Yes 13840.0
In [24]:
categorical_cols = train_df.select_dtypes(include=['object']).columns.tolist()
# Remove identifier and date columns (some date columns might still be here if conversion failed)
categorical_cols = [col for col in categorical_cols if col not in ['BeneID', 'ClaimID', 'Provider',
'ClaimStartDt', 'ClaimEndDt',
'AdmissionDt', 'DischargeDt', 'DOB', 'DOD',
'AttendingPhysician', 'OperatingPhysician',
'OtherPhysician', 'ClmAdmitDiagnosisCode',
'DiagnosisGroupCode',
'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2',
'ClmDiagnosisCode_3', 'ClmDiagnosisCode_4',
'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8',
'ClmDiagnosisCode_9', 'ClmDiagnosisCode_10',
'ClmProcedureCode_1', 'ClmProcedureCode_2',
'ClmProcedureCode_3', 'ClmProcedureCode_4',
'ClmProcedureCode_5', 'ClmProcedureCode_6']]
for col in categorical_cols:
if col in train_df.columns:
plt.figure(figsize=(10, 6))
# Check the number of unique values to avoid overcrowding the plot
unique_values = train_df[col].nunique()
if unique_values > 50: # it will Adjust this threshold as needed
print(f"Column '{col}' has {unique_values} unique values. Skipping countplot for visualization.")
# Calculate and print fraud proportions instead--the most important
fraud_proportions = train_df.groupby(col)['PotentialFraud'].value_counts(normalize=True).unstack()
print("\nFraud Proportions:\n", fraud_proportions)
else:
sns.countplot(data=train_df, x=col, hue='PotentialFraud', order=train_df[col].value_counts().index)
plt.title(f'Distribution of {col} by Potential Fraud')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
print(f"\nValue Counts and Fraud Proportions for {col}:")
print(train_df.groupby(col)['PotentialFraud'].value_counts(normalize=True))
else:
print(f"Column '{col}' not found in train_df.")
Value Counts and Fraud Proportions for RenalDiseaseIndicator:
RenalDiseaseIndicator PotentialFraud
0 No 0.620607
Yes 0.379393
Y No 0.611372
Yes 0.388628
Name: proportion, dtype: float64
Value Counts and Fraud Proportions for PotentialFraud: PotentialFraud No 1.0 Yes 1.0 Name: proportion, dtype: float64
In [25]:
categorical_cols = train_df.select_dtypes(include=['object']).columns.tolist()
categorical_cols = [col for col in categorical_cols if col not in ['BeneID', 'ClaimID', 'Provider',
'ClaimStartDt', 'ClaimEndDt',
'AdmissionDt', 'DischargeDt', 'DOB', 'DOD',
'AttendingPhysician', 'OperatingPhysician',
'OtherPhysician', 'ClmAdmitDiagnosisCode',
'DiagnosisGroupCode',
'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2',
'ClmDiagnosisCode_3', 'ClmDiagnosisCode_4',
'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8',
'ClmDiagnosisCode_9', 'ClmDiagnosisCode_10',
'ClmProcedureCode_1', 'ClmProcedureCode_2',
'ClmProcedureCode_3', 'ClmProcedureCode_4',
'ClmProcedureCode_5', 'ClmProcedureCode_6']]
for col in categorical_cols:
if col in train_df.columns:
print(f"\n--- Analysis of Categorical Feature: {col} ---")
unique_values = train_df[col].nunique()
# 1. Check Cardinality
if unique_values > 50: # it will Adjust threshold if needed
print(f"Column '{col}' has high cardinality ({unique_values} unique values). Displaying fraud proportions.")
fraud_proportions = train_df.groupby(col)['PotentialFraud'].value_counts(normalize=True).unstack()
print(fraud_proportions)
top_n = 10
top_categories = train_df[col].value_counts().head(top_n).index
top_categories_df = train_df[train_df[col].isin(top_categories)]
print(f"\nTop {top_n} Categories Fraud Proportions:")
print(top_categories_df.groupby(col)['PotentialFraud'].value_counts(normalize=True).unstack())
else:
plt.figure(figsize=(10, 6))
sns.countplot(data=train_df, x=col, hue='PotentialFraud', order=train_df[col].value_counts().index)
plt.title(f'Distribution of {col} by Potential Fraud')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
print("\nOverall Fraud Proportions:")
print(train_df.groupby(col)['PotentialFraud'].value_counts(normalize=True))
else:
print(f"Column '{col}' not found in train_df.")
--- Analysis of Categorical Feature: RenalDiseaseIndicator ---
Overall Fraud Proportions:
RenalDiseaseIndicator PotentialFraud
0 No 0.620607
Yes 0.379393
Y No 0.611372
Yes 0.388628
Name: proportion, dtype: float64
--- Analysis of Categorical Feature: PotentialFraud ---
Overall Fraud Proportions: PotentialFraud No 1.0 Yes 1.0 Name: proportion, dtype: float64
In [26]:
date_cols = ['ClaimStartDt', 'ClaimEndDt', 'DOB', 'DOD']
if 'AdmissionDt' in train_df.columns:
date_cols.append('AdmissionDt')
if 'DischargeDt' in train_df.columns:
date_cols.append('DischargeDt')
for col in date_cols:
if col in train_df.columns:
print(f"\n--- Analysis of Date Feature: {col} ---")
try:
train_df[col] = pd.to_datetime(train_df[col], errors='coerce')
print(f"Converted '{col}' to datetime.")
except ValueError as e:
print(f"Column '{col}' cannot be converted to datetime: {e}. Skipping analysis.")
continue
print("\nBasic Statistics:")
print(train_df[col].describe())
if 'ClaimStartDt' in train_df.columns and 'ClaimEndDt' in train_df.columns: # More robust check
if pd.api.types.is_datetime64_any_dtype(train_df['ClaimStartDt']) and pd.api.types.is_datetime64_any_dtype(train_df['ClaimEndDt']):
train_df['ClaimDuration'] = (train_df['ClaimEndDt'] - train_df['ClaimStartDt']).dt.days
print(f"\nClaim Duration Statistics:")
print(train_df['ClaimDuration'].describe())
plt.figure(figsize=(8, 5))
sns.histplot(train_df['ClaimDuration'].dropna(), kde=True)
plt.title('Distribution of Claim Duration')
plt.show()
plt.figure(figsize=(8, 5))
sns.boxplot(x='PotentialFraud', y='ClaimDuration', data=train_df)
plt.title('Claim Duration by Potential Fraud')
plt.show()
else:
print("ClaimStartDt or ClaimEndDt are not proper datetime columns. Skipping ClaimDuration calculation.")
if col == 'DOB' and 'DOB' in train_df.columns: # Robust check
train_df['Age'] = ((pd.to_datetime('2009-12-31') - train_df['DOB']).dt.days / 365.25).astype(int)
print("\nAge Statistics:")
print(train_df['Age'].describe())
plt.figure(figsize=(8, 5))
sns.histplot(train_df['Age'].dropna(), kde=True)
plt.title('Distribution of Age')
plt.show()
plt.figure(figsize=(8, 5))
sns.boxplot(x='PotentialFraud', y='Age', data=train_df)
plt.title('Age by Potential Fraud')
plt.show()
if col == 'DOD' and 'DOD' in train_df.columns: # Robust check
train_df['DOD_Flag'] = train_df['DOD'].notna().astype(int)
print("\nDistribution of DOD_Flag:")
print(train_df['DOD_Flag'].value_counts(normalize=True))
print("\nFraud Proportions by DOD_Flag:")
print(train_df.groupby('DOD_Flag')['PotentialFraud'].value_counts(normalize=True))
else:
print(f"Column '{col}' not found in train_df.")
--- Analysis of Date Feature: ClaimStartDt --- Converted 'ClaimStartDt' to datetime. Basic Statistics: count 558211 mean 2009-06-24 23:39:21.603766528 min 2008-11-27 00:00:00 25% 2009-03-27 00:00:00 50% 2009-06-23 00:00:00 75% 2009-09-22 00:00:00 max 2009-12-31 00:00:00 Name: ClaimStartDt, dtype: object ClaimStartDt or ClaimEndDt are not proper datetime columns. Skipping ClaimDuration calculation. --- Analysis of Date Feature: ClaimEndDt --- Converted 'ClaimEndDt' to datetime. Basic Statistics: count 558211 mean 2009-06-26 17:07:35.601913600 min 2008-12-28 00:00:00 25% 2009-03-29 00:00:00 50% 2009-06-24 00:00:00 75% 2009-09-23 00:00:00 max 2009-12-31 00:00:00 Name: ClaimEndDt, dtype: object Claim Duration Statistics: count 558211.000000 mean 1.727940 std 4.904984 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 36.000000 Name: ClaimDuration, dtype: float64
--- Analysis of Date Feature: DOB --- Converted 'DOB' to datetime. Basic Statistics: count 558211 mean 1936-03-07 23:58:27.131890944 min 1909-01-01 00:00:00 25% 1927-07-01 00:00:00 50% 1935-05-01 00:00:00 75% 1941-12-01 00:00:00 max 1983-12-01 00:00:00 Name: DOB, dtype: object Claim Duration Statistics: count 558211.000000 mean 1.727940 std 4.904984 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 36.000000 Name: ClaimDuration, dtype: float64
Age Statistics: count 558211.000000 mean 73.276820 std 13.011761 min 26.000000 25% 68.000000 50% 74.000000 75% 82.000000 max 100.000000 Name: Age, dtype: float64
--- Analysis of Date Feature: DOD --- Converted 'DOD' to datetime. Basic Statistics: count 4131 mean 2009-08-20 15:32:27.712418048 min 2009-02-01 00:00:00 25% 2009-07-01 00:00:00 50% 2009-09-01 00:00:00 75% 2009-11-01 00:00:00 max 2009-12-01 00:00:00 Name: DOD, dtype: object Claim Duration Statistics: count 558211.000000 mean 1.727940 std 4.904984 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 36.000000 Name: ClaimDuration, dtype: float64
Distribution of DOD_Flag:
DOD_Flag
0 0.9926
1 0.0074
Name: proportion, dtype: float64
Fraud Proportions by DOD_Flag:
DOD_Flag PotentialFraud
0 No 0.618734
Yes 0.381266
1 No 0.626241
Yes 0.373759
Name: proportion, dtype: float64
--- Analysis of Date Feature: AdmissionDt ---
Converted 'AdmissionDt' to datetime.
Basic Statistics:
count 40474
mean 2009-06-19 17:38:12.493946880
min 2008-11-27 00:00:00
25% 2009-03-20 00:00:00
50% 2009-06-16 00:00:00
75% 2009-09-17 00:00:00
max 2009-12-31 00:00:00
Name: AdmissionDt, dtype: object
Claim Duration Statistics:
count 558211.000000
mean 1.727940
std 4.904984
min 0.000000
25% 0.000000
50% 0.000000
75% 0.000000
max 36.000000
Name: ClaimDuration, dtype: float64
--- Analysis of Date Feature: DischargeDt --- Converted 'DischargeDt' to datetime. Basic Statistics: count 40474 mean 2009-06-25 09:36:02.988585216 min 2009-01-01 00:00:00 25% 2009-03-26 00:00:00 50% 2009-06-22 00:00:00 75% 2009-09-23 00:00:00 max 2009-12-31 00:00:00 Name: DischargeDt, dtype: object Claim Duration Statistics: count 558211.000000 mean 1.727940 std 4.904984 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 36.000000 Name: ClaimDuration, dtype: float64
In [27]:
def create_date_features(df):
"""
Creates various date-related features from the date columns.
Args:
df (pd.DataFrame): Input DataFrame containing date columns.
Returns:
pd.DataFrame: DataFrame with added date features.
"""
if 'ClaimStartDt' in df.columns:
df['ClaimStartDt'] = pd.to_datetime(df['ClaimStartDt'], errors='coerce')
df['ClaimStart_Month'] = df['ClaimStartDt'].dt.month
df['ClaimStart_Quarter'] = df['ClaimStartDt'].dt.quarter
df['ClaimStart_Year'] = df['ClaimStartDt'].dt.year
df['ClaimStart_DayOfWeek'] = df['ClaimStartDt'].dt.dayofweek # Monday=0, Sunday=6
if 'ClaimEndDt' in df.columns:
df['ClaimEndDt'] = pd.to_datetime(df['ClaimEndDt'], errors='coerce')
df['ClaimEnd_Month'] = df['ClaimEndDt'].dt.month
df['ClaimEnd_Quarter'] = df['ClaimEndDt'].dt.quarter
df['ClaimEnd_Year'] = df['ClaimEndDt'].dt.year
df['ClaimEnd_DayOfWeek'] = df['ClaimEndDt'].dt.dayofweek
if 'AdmissionDt' in df.columns and 'DischargeDt' in df.columns:
df['AdmissionDt'] = pd.to_datetime(df['AdmissionDt'], errors='coerce')
df['DischargeDt'] = pd.to_datetime(df['DischargeDt'], errors='coerce')
df['LengthOfStay'] = (df['DischargeDt'] - df['AdmissionDt']).dt.days
df['Admission_Month'] = df['AdmissionDt'].dt.month
df['Admission_Quarter'] = df['AdmissionDt'].dt.quarter
df['Admission_Year'] = df['AdmissionDt'].dt.year
df['Admission_DayOfWeek'] = df['AdmissionDt'].dt.dayofweek
df['Discharge_Month'] = df['DischargeDt'].dt.month
df['Discharge_Quarter'] = df['DischargeDt'].dt.quarter
df['Discharge_Year'] = df['DischargeDt'].dt.year
df['Discharge_DayOfWeek'] = df['DischargeDt'].dt.dayofweek
if 'DOB' in df.columns:
df['DOB'] = pd.to_datetime(df['DOB'], errors='coerce')
df['Age'] = ((pd.to_datetime('2009-12-31') - df['DOB']).dt.days / 365.25).astype(int)
return df
train_df = create_date_features(train_df)
print("Shape after date feature engineering:", train_df.shape)
print(train_df[['ClaimStart_Month', 'ClaimStart_Year', 'LengthOfStay', 'Age']].head())
Shape after date feature engineering: (558211, 75) ClaimStart_Month ClaimStart_Year LengthOfStay Age 0 4 2009 6.0 66 1 8 2009 2.0 66 2 9 2009 3.0 66 3 2 2009 8.0 95 4 8 2009 17.0 71
In [28]:
def create_claim_code_features(df):
"""
Creates features from diagnosis and procedure codes.
Args:
df (pd.DataFrame): Input DataFrame.
Returns:
pd.DataFrame: DataFrame with added claim code features.
"""
diagnosis_cols = [col for col in df.columns if col.startswith('ClmDiagnosisCode_')]
procedure_cols = [col for col in df.columns if col.startswith('ClmProcedureCode_')]
# 1. Count of Diagnosis Codes
df['DiagnosisCode_Count'] = df[diagnosis_cols].notna().sum(axis=1)
# 2. Count of Procedure Codes
df['ProcedureCode_Count'] = df[procedure_cols].notna().sum(axis=1)
# 3. Principal Diagnosis Code (First one)
if 'ClmDiagnosisCode_1' in df.columns:
df['PrincipalDiagnosis'] = df['ClmDiagnosisCode_1'].str[:3] # We have to keep only the first 3 characters
# 4. Any Diagnosis Code Present Flags
for col in diagnosis_cols:
df[f'{col}_Flag'] = df[col].notna().astype(int)
for col in procedure_cols:
df[f'{col}_Flag'] = df[col].notna().astype(int)
return df
train_df = create_claim_code_features(train_df)
print("Shape after claim code feature engineering:", train_df.shape)
print(train_df[['DiagnosisCode_Count', 'ProcedureCode_Count', 'PrincipalDiagnosis']].head())
Shape after claim code feature engineering: (558211, 94) DiagnosisCode_Count ProcedureCode_Count PrincipalDiagnosis 0 9 0 197 1 3 1 618 2 6 0 296 3 9 1 434 4 9 1 042
In [29]:
import numpy as np # Import numpy for Mathematical operations
def create_beneficiary_features(df):
"""
Creates features from beneficiary-related information (optimized for speed).
Args:
df (pd.DataFrame): Input DataFrame.
Returns:
pd.DataFrame: DataFrame with added beneficiary features.
"""
# 1. Chronic Condition Count (Optimized)
chronic_cols = [col for col in df.columns if col.startswith('Chronic')]
if chronic_cols: # Check if there are any chronic condition columns
df['ChronicCondition_Count'] = (df[chronic_cols] == 'Yes').sum(axis=1)
if 'RenalDiseaseIndicator' in df.columns:
df['RenalDiseaseIndicator_Flag'] = (df['RenalDiseaseIndicator'].str.lower() == 'yes').astype(int)
df.drop(columns=['RenalDiseaseIndicator'], inplace=True, errors='ignore')
if 'DOD_Flag' not in df.columns and 'DOD' in df.columns:
df['DOD_Flag'] = df['DOD'].notna().astype(int)
return df
train_df = create_beneficiary_features(train_df)
print("Shape after beneficiary feature engineering:", train_df.shape)
print(train_df[['ChronicCondition_Count', 'RenalDiseaseIndicator_Flag']].head())
Shape after beneficiary feature engineering: (558211, 95) ChronicCondition_Count RenalDiseaseIndicator_Flag 0 0 0 1 0 0 2 0 0 3 0 0 4 0 0
In [30]:
def select_correlated_features(df, threshold=0.8):
"""
Selects features by removing highly correlated ones.
Args:
df (pd.DataFrame): Input DataFrame.
threshold (float): Correlation threshold.
Returns:
pd.DataFrame: DataFrame with selected features.
list: List of dropped columns.
"""
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
numerical_cols = [col for col in numerical_cols if col not in ['PotentialFraud']] # Exclude target
corr_matrix = df[numerical_cols].corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if any(upper[col] > threshold)]
selected_df = df.drop(columns=to_drop, errors='ignore') # Add errors='ignore'
return selected_df, to_drop
train_df_selected_corr, dropped_cols_corr = select_correlated_features(train_df)
print("Shape after correlation-based feature selection:", train_df_selected_corr.shape)
print("Dropped columns (correlation):", dropped_cols_corr)
Shape after correlation-based feature selection: (558211, 73) Dropped columns (correlation): ['OPAnnualDeductibleAmt', 'ClaimDuration', 'ClaimStart_Quarter', 'ClaimEnd_Month', 'ClaimEnd_Quarter', 'ClaimEnd_DayOfWeek', 'LengthOfStay', 'Admission_Month', 'Admission_Quarter', 'Admission_Year', 'Admission_DayOfWeek', 'Discharge_Month', 'Discharge_Quarter', 'Discharge_DayOfWeek', 'ClmDiagnosisCode_4_Flag', 'ClmDiagnosisCode_5_Flag', 'ClmDiagnosisCode_6_Flag', 'ClmDiagnosisCode_7_Flag', 'ClmDiagnosisCode_8_Flag', 'ClmDiagnosisCode_9_Flag', 'ClmDiagnosisCode_10_Flag', 'ClmProcedureCode_1_Flag']
In [31]:
from sklearn.ensemble import RandomForestClassifier
def select_top_features_rf_optimized(df, target_col='PotentialFraud', top_n=20):
"""
Selects the top N features based on Random Forest feature importance (optimized for speed).
Args:
df (pd.DataFrame): Input DataFrame.
target_col (str): Name of the target column.
top_n (int): Number of top features to select.
Returns:
pd.DataFrame: DataFrame with selected features.
list: List of selected feature names.
"""
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
numerical_cols = [col for col in numerical_cols if col not in [target_col]]
X = df[numerical_cols]
y = df[target_col]
model = RandomForestClassifier(
n_estimators=50, # Reduced number of trees
max_depth=10, # Limited tree depth
max_features='sqrt', # Feature subsampling
n_jobs=-1, # Parallelization
random_state=42,
class_weight='balanced'
)
model.fit(X, y)
importance = model.feature_importances_
feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': importance})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
top_features = feature_importance['Feature'].head(top_n).tolist()
selected_df = df[[target_col] + top_features]
return selected_df, top_features
train_df_selected_rf_optimized, top_features_rf_optimized = select_top_features_rf_optimized(train_df)
print("Shape after Random Forest-based feature selection:", train_df_selected_rf_optimized.shape)
print("Top features (Random Forest):", top_features_rf_optimized)
Shape after Random Forest-based feature selection: (558211, 21) Top features (Random Forest): ['State', 'County', 'Admission_Quarter', 'LengthOfStay', 'DeductibleAmtPaid', 'InscClaimAmtReimbursed', 'Discharge_Month', 'Race', 'Discharge_DayOfWeek', 'Admission_Year', 'Admission_DayOfWeek', 'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt', 'Admission_Month', 'Age', 'IPAnnualReimbursementAmt', 'Discharge_Quarter', 'ClaimDuration', 'IPAnnualDeductibleAmt', 'ClmProcedureCode_1_Flag']
In [32]:
def prepare_data_for_modeling(df, target_col='PotentialFraud'):
"""
Prepares the data for modeling by handling missing values and encoding categorical features.
Args:
df (pd.DataFrame): Input DataFrame.
target_col (str): Name of the target column.
Returns:
pd.DataFrame: Prepared DataFrame.
list: List of categorical columns that were encoded.
"""
df = df.copy() # Work on a copy to avoid modifying the original DataFrame
# 1. Handle Missing Values
for col in df.columns:
if df[col].isnull().any():
if pd.api.types.is_numeric_dtype(df[col]):
df[col] = df[col].fillna(df[col].median()) # Impute numerical with median(avg)
else:
df[col] = df[col].fillna(df[col].mode()[0]) # Impute categorical with mode
# 2. Encode Categorical Features
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
categorical_cols = [col for col in categorical_cols if col != target_col] # Exclude target
encoded_cols = []
if categorical_cols:
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
encoded_cols = [col for col in df.columns if any(part in col for part in categorical_cols)]
return df, encoded_cols
train_df_prepared, encoded_cols = prepare_data_for_modeling(train_df_selected_rf_optimized)
print("Shape after data preparation:", train_df_prepared.shape)
print("Encoded columns:", encoded_cols)
print("\nPrepared data head:")
print(train_df_prepared.head())
Shape after data preparation: (558211, 21) Encoded columns: [] Prepared data head: PotentialFraud State County Admission_Quarter LengthOfStay \ 0 Yes 39 230 2.0 6.0 1 No 39 230 3.0 2.0 2 No 39 230 3.0 3.0 3 No 1 360 1.0 8.0 4 No 45 780 3.0 17.0 DeductibleAmtPaid InscClaimAmtReimbursed Discharge_Month Race \ 0 1068.0 26000 4.0 1 1 1068.0 5000 9.0 1 2 1068.0 5000 9.0 1 3 1068.0 5000 2.0 2 4 1068.0 10000 8.0 1 Discharge_DayOfWeek ... Admission_DayOfWeek OPAnnualReimbursementAmt \ 0 5.0 ... 6.0 60 1 2.0 ... 0.0 60 2 6.0 ... 3.0 60 3 6.0 ... 5.0 250 4 6.0 ... 3.0 120 OPAnnualDeductibleAmt Admission_Month Age IPAnnualReimbursementAmt \ 0 70 4.0 66 36000 1 70 8.0 66 36000 2 70 9.0 66 36000 3 320 2.0 95 5000 4 100 8.0 71 21260 Discharge_Quarter ClaimDuration IPAnnualDeductibleAmt \ 0 2.0 6 3204 1 3.0 2 3204 2 3.0 3 3204 3 1.0 8 1068 4 3.0 17 2136 ClmProcedureCode_1_Flag 0 0 1 1 2 0 3 1 4 1 [5 rows x 21 columns]
In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
data = {'Feature1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'Feature2': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
'PotentialFraud': [0, 0, 0, 0, 1, 1, 1, 0, 0, 1],
'Provider': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']}
final_df = pd.DataFrame(data)
X = final_df.drop(columns=['Provider', 'PotentialFraud'])
y = final_df['PotentialFraud']
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
# Train model
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')
rf_model.fit(X_train, y_train)
# Predict and evaluate
y_pred = rf_model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
Classification Report:
precision recall f1-score support
0 1.00 1.00 1.00 1
1 1.00 1.00 1.00 1
accuracy 1.00 2
macro avg 1.00 1.00 1.00 2
weighted avg 1.00 1.00 1.00 2
In [34]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, auc
# 1. AUC-ROC Curve
# Get predicted probabilities for the positive class (PotentialFraud = 1)
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]
# Calculate ROC curve and AUC score
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
auc_score = roc_auc_score(y_test, y_pred_proba)
# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc_score:.2f}')
plt.plot([0, 1], [0, 1], 'k--') # Diagonal line (random classifier)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()
print(f'AUC Score: {auc_score}')
# These are the most crucial steps
# 2. Precision-Recall Curve
# Calculate Precision-Recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
auc_pr = auc(recall, precision)
# Plot Precision-Recall curve
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label=f'AUPRC = {auc_pr:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()
print(f'AUPRC Score: {auc_pr}')
AUC Score: 1.0
AUPRC Score: 1.0
In [35]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
# Define the hyperparameter grid
param_grid = {
'n_estimators': [50, 100, 150],
'max_depth': [5, 10, 15, None],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
'max_features': ['sqrt', 'log2']
}
rf_random = RandomizedSearchCV(
estimator=RandomForestClassifier(random_state=42, class_weight='balanced'),
param_distributions=param_grid,
n_iter=10,
cv=3,
verbose=1,
random_state=42,
n_jobs=-1,
scoring='roc_auc'
)
# Fit the RandomizedSearchCV
rf_random.fit(X_train, y_train)
# Best hyperparameters
print("Best Hyperparameters:", rf_random.best_params_)
# Best model
best_rf_model = rf_random.best_estimator_
# Evaluate the best model (you can reuse the evaluation code from previous steps)
y_pred = best_rf_model.predict(X_test)
print(classification_report(y_test, y_pred))
y_pred_proba = best_rf_model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"AUC Score: {auc_score}")
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Hyperparameters: {'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': None}
precision recall f1-score support
0 1.00 1.00 1.00 1
1 1.00 1.00 1.00 1
accuracy 1.00 2
macro avg 1.00 1.00 1.00 2
weighted avg 1.00 1.00 1.00 2
AUC Score: 1.0
In [36]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
# Logistic Regression
logreg_model = LogisticRegression(random_state=42, class_weight='balanced', solver='liblinear')
logreg_model.fit(X_train, y_train)
y_pred_logreg = logreg_model.predict(X_test)
print("Logistic Regression:\n", classification_report(y_test, y_pred_logreg))
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1])) # scale_pos_weight handles imbalance
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
print("XGBoost:\n", classification_report(y_test, y_pred_xgb))
Logistic Regression:
precision recall f1-score support
0 0.50 1.00 0.67 1
1 0.00 0.00 0.00 1
accuracy 0.50 2
macro avg 0.25 0.50 0.33 2
weighted avg 0.25 0.50 0.33 2
XGBoost:
precision recall f1-score support
0 0.50 1.00 0.67 1
1 0.00 0.00 0.00 1
accuracy 0.50 2
macro avg 0.25 0.50 0.33 2
weighted avg 0.25 0.50 0.33 2
/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/usr/local/lib/python3.11/dist-packages/xgboost/core.py:158: UserWarning: [13:17:59] WARNING: /workspace/src/learner.cc:740:
Parameters: { "use_label_encoder" } are not used.
warnings.warn(smsg, UserWarning)
/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
inpatient_df = pd.read_csv('/content/Train_Inpatientdata-1542865627584.csv')
outpatient_df = pd.read_csv('/content/Train_Outpatientdata-1542865627584.csv')
beneficiary_df = pd.read_csv('/content/Train_Beneficiarydata-1542865627584.csv')
labels_df = pd.read_csv('/content/Train-1542865627584.csv')
def create_date_features(df):
for col in ['ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt']:
if col in df.columns:
df[col] = pd.to_datetime(df[col], errors='coerce')
df[col + '_year'] = df[col].dt.year
df[col + '_month'] = df[col].dt.month
df[col + '_day'] = df[col].dt.day
df[col + '_dayofweek'] = df[col].dt.dayofweek
if 'AdmissionDt' in df.columns and 'DischargeDt' in df.columns:
df['LengthOfStay'] = (df['DischargeDt'] - df['AdmissionDt']).dt.days
df['AdmitForDays'] = df['LengthOfStay']
return df
def create_claim_code_features(df):
for prefix in ['ClmDiagnosisCode_', 'ClmProcedureCode_']:
cols = [col for col in df.columns if col.startswith(prefix)]
if cols:
df[prefix + 'count'] = df[cols].count(axis=1)
return df
def create_beneficiary_features(df):
for prefix in ['ChronicCond_']:
cols = [col for col in df.columns if col.startswith(prefix)]
if cols:
df[prefix + 'count'] = (df[cols] == 'Yes').sum(axis=1)
if 'RenalDiseaseIndicator' in df.columns:
df['RenalDiseaseIndicator'] = df['RenalDiseaseIndicator'].str.lower() == 'yes'
return df
def create_age_bins(df):
if 'Age' in df.columns:
df['Age_bins'] = pd.cut(df['Age'], bins=[0, 65, 75, float('inf')], labels=[0, 1, 2])
return df
# Preprocess dataframes
inpatient_df = create_date_features(inpatient_df)
outpatient_df = create_date_features(outpatient_df)
inpatient_df = create_claim_code_features(inpatient_df)
outpatient_df = create_claim_code_features(outpatient_df)
beneficiary_df = create_beneficiary_features(beneficiary_df)
beneficiary_df = create_age_bins(beneficiary_df)
# Concatenate both inpatient and outpatient dataframes
claims_df = pd.concat([inpatient_df, outpatient_df], ignore_index=True)
# Merge claims with beneficiary data using BeneID
print("claims_df columns before merge:", claims_df.columns)
print("beneficiary_df columns before merge:", beneficiary_df.columns)
claims_merged = claims_df.merge(beneficiary_df, on='BeneID', how='left')
print("claims_merged columns after merge:", claims_merged.columns)
# Aggregate features by Provider
def aggregate_provider_features(df):
agg_funcs = {
'ClaimID': 'count',
'InscClaimAmtReimbursed': 'sum',
'DeductibleAmtPaid': 'sum',
'AdmitForDays': 'sum',
'DiagnosisGroupCode': 'nunique',
'ClmProcedureCode_count': 'sum',
'ClmDiagnosisCode_count': 'sum',
'LengthOfStay': 'sum',
'ChronicCond_count': 'sum',
'Age_bins': 'sum'
}
# Check if columns exist before aggregating
cols_to_agg = {col: agg_funcs[col] for col in agg_funcs if col in df.columns}
if not cols_to_agg:
return pd.DataFrame() # Return empty DataFrame if no columns to aggregate
agg_df = df.groupby('Provider').agg(cols_to_agg)
agg_df.columns = ['Provider_' + col if col != 'Provider' else col for col in agg_df.columns]
if 'InscClaimAmtReimbursed' in df.columns:
agg_df['Provider_Avg_InscClaimAmtReimbursed'] = df.groupby('Provider')['InscClaimAmtReimbursed'].mean()
if 'DeductibleAmtPaid' in df.columns:
agg_df['Provider_Avg_DeductibleAmtPaid'] = df.groupby('Provider')['DeductibleAmtPaid'].mean()
if 'AdmitForDays' in df.columns:
agg_df['Provider_Avg_AdmitForDays'] = df.groupby('Provider')['AdmitForDays'].mean()
return agg_df.reset_index()
provider_features = aggregate_provider_features(claims_merged)
final_df = provider_features.merge(labels_df, on='Provider', how='left')
final_df.drop(columns=['Provider'], inplace=True)
final_df['PotentialFraud'] = final_df['PotentialFraud'].map({'Yes': 1, 'No': 0})
X = final_df.drop(columns=['PotentialFraud'])
y = final_df['PotentialFraud'].astype(int)
numeric_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), numeric_cols),
('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
],
remainder='passthrough'
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# --- Impute missing values ---
imputer = SimpleImputer(strategy='mean') # Or 'median', 'most_frequent', etc.
# Create the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
('imputer', imputer)])
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)
# --- Models ---
# Logistic Regression
logreg_model = LogisticRegression(random_state=42, class_weight='balanced', solver='liblinear')
logreg_model.fit(X_train, y_train)
y_pred_logreg = logreg_model.predict(X_test)
y_proba_logreg = logreg_model.predict_proba(X_test)[:, 1] # Probabilities for class 1
print("Logistic Regression:\n", classification_report(y_test, y_pred_logreg))
# XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]))
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
y_proba_xgb = xgb_model.predict_proba(X_test)[:, 1] # Probabilities for class 1
print("XGBoost:\n", classification_report(y_test, y_pred_xgb))
# --- Prediction on Unseen Data and CSV Creation ---
unseen_inpatient_df = pd.read_csv('/content/Unseen_Inpatientdata-1542969243754.csv')
unseen_outpatient_df = pd.read_csv('/content/Unseen_Outpatientdata-1542969243754.csv')
unseen_beneficiary_df = pd.read_csv('/content/Unseen_Beneficiarydata-1542969243754.csv')
unseen_labels_df = pd.read_csv('/content/Unseen-1542969243754.csv')
unseen_inpatient_df = create_date_features(unseen_inpatient_df)
unseen_outpatient_df = create_date_features(unseen_outpatient_df)
unseen_inpatient_df = create_claim_code_features(unseen_inpatient_df)
unseen_outpatient_df = create_claim_code_features(unseen_outpatient_df)
unseen_beneficiary_df = create_beneficiary_features(unseen_beneficiary_df)
unseen_beneficiary_df = create_age_bins(unseen_beneficiary_df)
unseen_claims_df = pd.concat([unseen_inpatient_df, unseen_outpatient_df], ignore_index=True)
unseen_claims_merged = unseen_claims_df.merge(unseen_beneficiary_df, on='BeneID', how='left')
unseen_provider_features = aggregate_provider_features(unseen_claims_merged)
# Align columns - VERY IMPORTANT
train_cols = X.columns
unseen_provider_features = unseen_provider_features.reindex(columns=train_cols, fill_value=0) # Use fill_value=0 for missing cols
unseen_processed = pipeline.transform(unseen_provider_features)
unseen_probabilities = xgb_model.predict_proba(unseen_processed)[:, 1]
unseen_predictions = xgb_model.predict(unseen_processed)
submission_df = pd.DataFrame({
'Provider': unseen_labels_df['Provider'],
'Probability': unseen_probabilities,
'PredictedClass': unseen_predictions
})
submission_df.to_csv('Vedant Avinash Patil_Submission.csv', index=False)
from google.colab import files
files.download('Vedant Avinash Patil_Submission.csv')
print("Submission file created: Vedant Avinash Patil_Submission.csv")
claims_df columns before merge: Index(['BeneID', 'ClaimID', 'ClaimStartDt', 'ClaimEndDt', 'Provider',
'InscClaimAmtReimbursed', 'AttendingPhysician', 'OperatingPhysician',
'OtherPhysician', 'AdmissionDt', 'ClmAdmitDiagnosisCode',
'DeductibleAmtPaid', 'DischargeDt', 'DiagnosisGroupCode',
'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
'ClmDiagnosisCode_10', 'ClmProcedureCode_1', 'ClmProcedureCode_2',
'ClmProcedureCode_3', 'ClmProcedureCode_4', 'ClmProcedureCode_5',
'ClmProcedureCode_6', 'ClaimStartDt_year', 'ClaimStartDt_month',
'ClaimStartDt_day', 'ClaimStartDt_dayofweek', 'ClaimEndDt_year',
'ClaimEndDt_month', 'ClaimEndDt_day', 'ClaimEndDt_dayofweek',
'AdmissionDt_year', 'AdmissionDt_month', 'AdmissionDt_day',
'AdmissionDt_dayofweek', 'DischargeDt_year', 'DischargeDt_month',
'DischargeDt_day', 'DischargeDt_dayofweek', 'LengthOfStay',
'AdmitForDays', 'ClmDiagnosisCode_count', 'ClmProcedureCode_count'],
dtype='object')
beneficiary_df columns before merge: Index(['BeneID', 'DOB', 'DOD', 'Gender', 'Race', 'RenalDiseaseIndicator',
'State', 'County', 'NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
'ChronicCond_stroke', 'IPAnnualReimbursementAmt',
'IPAnnualDeductibleAmt', 'OPAnnualReimbursementAmt',
'OPAnnualDeductibleAmt', 'ChronicCond_count'],
dtype='object')
claims_merged columns after merge: Index(['BeneID', 'ClaimID', 'ClaimStartDt', 'ClaimEndDt', 'Provider',
'InscClaimAmtReimbursed', 'AttendingPhysician', 'OperatingPhysician',
'OtherPhysician', 'AdmissionDt', 'ClmAdmitDiagnosisCode',
'DeductibleAmtPaid', 'DischargeDt', 'DiagnosisGroupCode',
'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
'ClmDiagnosisCode_10', 'ClmProcedureCode_1', 'ClmProcedureCode_2',
'ClmProcedureCode_3', 'ClmProcedureCode_4', 'ClmProcedureCode_5',
'ClmProcedureCode_6', 'ClaimStartDt_year', 'ClaimStartDt_month',
'ClaimStartDt_day', 'ClaimStartDt_dayofweek', 'ClaimEndDt_year',
'ClaimEndDt_month', 'ClaimEndDt_day', 'ClaimEndDt_dayofweek',
'AdmissionDt_year', 'AdmissionDt_month', 'AdmissionDt_day',
'AdmissionDt_dayofweek', 'DischargeDt_year', 'DischargeDt_month',
'DischargeDt_day', 'DischargeDt_dayofweek', 'LengthOfStay',
'AdmitForDays', 'ClmDiagnosisCode_count', 'ClmProcedureCode_count',
'DOB', 'DOD', 'Gender', 'Race', 'RenalDiseaseIndicator', 'State',
'County', 'NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
'ChronicCond_stroke', 'IPAnnualReimbursementAmt',
'IPAnnualDeductibleAmt', 'OPAnnualReimbursementAmt',
'OPAnnualDeductibleAmt', 'ChronicCond_count'],
dtype='object')
Logistic Regression:
precision recall f1-score support
0 0.98 0.88 0.93 981
1 0.43 0.86 0.58 101
accuracy 0.88 1082
macro avg 0.71 0.87 0.75 1082
weighted avg 0.93 0.88 0.90 1082
/usr/local/lib/python3.11/dist-packages/xgboost/core.py:158: UserWarning: [13:18:06] WARNING: /workspace/src/learner.cc:740:
Parameters: { "use_label_encoder" } are not used.
warnings.warn(smsg, UserWarning)
XGBoost:
precision recall f1-score support
0 0.96 0.94 0.95 981
1 0.54 0.63 0.58 101
accuracy 0.92 1082
macro avg 0.75 0.79 0.77 1082
weighted avg 0.92 0.92 0.92 1082
Submission file created: Vedant Avinash Patil_Submission.csv
In [ ]: